library(data.table)
library(ggplot2)
library(knitr)
library(ggrepel)
library(RColorBrewer)
library(DESeq2)
#library(rnaseqGene)
library(plotly)
library(Rtsne)
library(scales)
library(dplyr)

rm(list = ls())

setDTthreads(8)

data.output.dir <- file.path(here::here(
  '..','..',
  's3-roybal-tcsl',
  'lenti_screen_compiled_data','data'))
load(file=file.path(data.output.dir, 'pooled_analysis_data.Rdata'))

# set this to true to re-run, else will load from s3
rerun_deseq <- F

# arrayed list of CARs
array.list <- c('BAFF-R','TNR8','4-1BB','TACI','CD28','KLRG1','CD40')

# load colors
source(here::here('r','fig_colors.R'))

deSeq2 normalization followed by CAR score calculation

We’d like to use DESeq2’s normalization for CAR-score and to properly normalize the baseline measurements.

This can be done with either VST or rlog.

https://www.biostars.org/p/188808/ https://support.bioconductor.org/p/126713/

cts <- dcast(
    read.counts[, .(
            CAR.align, 
            sort.group.bin, 
            k.type, t.type, batch, assay, donor, 
            sort.group, 
            bin,
            counts)], 
    CAR.align ~ sort.group.bin, value.var='counts')

cts <- data.frame(cts[, -1], row.names = cts[, CAR.align])
cts[is.na(cts)] <- 0

vst_cts <- reshape2::melt(varianceStabilizingTransformation(as.matrix(cts)))
## converting counts to integer mode
names(vst_cts) <- c('CAR.align','sort.group.bin', 'vst')
rlog_cts <- reshape2::melt(rlog(as.matrix(cts)))
## rlog() may take a long time with 50 or more samples,
## vst() is a much faster transformation
## converting counts to integer mode
names(rlog_cts) <- c('CAR.align','sort.group.bin', 'rlog')
rlog_cts$CAR.align <- factor(rlog_cts$CAR.align, levels=1:40, 
  labels=levels(vst_cts$CAR.align))

norm_cts <- merge(
  vst_cts, rlog_cts, by=c('CAR.align','sort.group.bin'))

# convert 'post.cytof' to 'post-cytof'
levels(norm_cts$sort.group.bin) <- gsub(
  '\\.','-',levels(norm_cts$sort.group.bin))

read.counts <- merge(
  read.counts, norm_cts, by=c('CAR.align','sort.group.bin'), all=T)
read.counts <- read.counts[!is.na(t.type)]

#prolif 1 & 2

read.counts[, 
  vst_car_score := sum((vst - mean(vst)) * sqrt(ctv.bin.score)),
  by=c('CAR.align','sort.group')]

read.counts[, 
  rlog_car_score := sum((rlog - mean(rlog)) * sqrt(ctv.bin.score)),
  by=c('CAR.align','sort.group')]

## post-cytof

read.counts[batch=='post-cytof', 
  vst_car_score := sum((vst - mean(vst)) * sqrt(bin.score)),
  by=c('CAR.align','sort.group')]

read.counts[batch=='post-cytof', 
  rlog_car_score := sum((rlog - mean(rlog)) * sqrt(bin.score)),
  by=c('CAR.align','sort.group')]

# save an uneditied copy of read.counts for cytokines later
read.counts.complete <- copy(read.counts)

ggplot(read.counts) + 
    geom_boxplot(
        aes(x=reorder(CAR.align,vst_car_score), y=vst_car_score)) + 
    coord_flip() + 
    facet_wrap(t.type ~ k.type) + 
    facet_grid(t.type ~ assay + k.type)

ggplot(read.counts) + 
    geom_boxplot(
        aes(x=reorder(CAR.align,rlog_car_score), y=rlog_car_score)) + 
    coord_flip() + 
    facet_wrap(t.type ~ k.type) + 
    facet_grid(t.type ~ assay + k.type)

ggplot(read.counts) + 
    geom_point(
        aes(x=rlog_car_score, y=CAR.score, color=interaction(donor, batch))) + 
    coord_flip() + 
    facet_wrap(t.type ~ k.type) + 
    facet_grid(t.type ~ assay + k.type)

ggplot(read.counts) + 
    geom_point(
        aes(x=rlog_car_score, y=vst_car_score, color=interaction(donor, batch))) + 
    coord_flip() + 
    facet_wrap(t.type ~ k.type) + 
    facet_grid(t.type ~ assay + k.type)

Raw bin plot (for fig1?)

subset_barplot_reads <- read.counts[
  assay == 'CTV1' & batch == 'prolif2' & donor == 'd1' & k.type == 'pos']

subset_barplot_reads <- subset_barplot_reads[,
  CAR.ranked.sep := factor(
    paste(CAR.align,t.type,sep='|'))]

subset_barplot_reads[,
  CAR.ranked.sep := factor(CAR.ranked.sep, levels=levels(
    reorder(
      CAR.ranked.sep[k.type == 'pos'], 
      -rlog_car_score[k.type == 'pos'])))]

remove_t_type_sep <- function (breaks)
  unlist(lapply(breaks, function(str)
    strsplit(str, '|',fixed=T)[[1]][1]))

barplot_stacked_plot <- ggplot(subset_barplot_reads) +
  geom_bar(aes(y=car.bin.pct, x=CAR.ranked.sep, fill=bin),
    stat='identity', width=1) +
  scale_fill_brewer(palette = "YlGn") +
  facet_wrap(. ~ t.type, scales = 'free_x', ncol=1) +
  labs(y='% sorted into each bin', x='CAR Costimulatory Domains, ranked') +
  scale_x_discrete(expand = c(0, 0), labels=remove_t_type_sep) +
  scale_y_continuous(limits = c(0,1), expand = expansion(mult = c(0, 0)), 
                     breaks=c(0, .5, 1), labels=percent) +
  theme_bw(base_size=18) + theme(axis.ticks = element_blank()) + 
  theme(
    axis.text.x = element_text(size=9, angle = 90, vjust = 0.5, hjust=1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    panel.border = element_rect(colour = "black"))

ggsave(here::here('..','figs','pooled','fig1_stacked_bar.pdf'), 
       barplot_stacked_plot, height=3, width=9, useDingbats=FALSE)

DeSeq2 Computation

DESeq2 Function

run_deseq <- function(data.dt, ref_bin, test_bin, 
    control_replicates = T,
    interaction = T, group.control = F, weight.bins = F) 
{
  
  # identify inputs
  assay_input <- as.character(unique(data.dt[, assay]))
  k_type <- as.character(unique(data.dt[, k.type]))
  t_type <- as.character(unique(data.dt[, t.type]))
  
  ## 1. Do bin normalization weights =======
  # bin normalization weights
  data.weights <- unique(data.dt[, list(
    batch, donor, timepoint, assay, t.type, k.type, 
    sort.group, bin, bin.pct, bin.reads)])[, 
      list(bin, bin.reads, bin.pct, sort.group,
        read.weight=bin.pct * bin.reads / sum(bin.pct * bin.reads)),
      by=.(batch, donor, timepoint, assay, t.type, k.type)][, 
        read.weight.norm := read.weight/exp(mean(log(read.weight))), 
        by=.(batch, donor, timepoint, assay, t.type, k.type)]
  
  stopifnot(nrow(interaction(assay_input, k_type, t_type)) == 1)
  
  # prepare cts and coldata dataframes
  
  ## 2. Prepare Ref Bins ============
  if(length(ref_bin) == 1 & ref_bin[1] == 'baseline') {
    # reference is baseline
    # get baseline counts per donor/assay replicate
    ref.bin.dt <- dcast(
      data.dt[
        bin == 'D' & assay == assay_input &
          k.type == k_type & t.type == t_type, 
        .(
          CAR.align, 
          bin.sort.group = paste(
            batch, donor, timepoint, assay, t.type, 'base', sep = '_'), 
          k.type, t.type, batch, assay, donor, 
          sort.group, 
          bin = 'base',
          counts = baseline.counts)], 
      CAR.align ~ bin.sort.group, value.var='counts')
    
    ref.weights <- rep(1, ncol(ref.bin.dt))
    
  } else {
    # reference is a specified bin
    ref.bin.dt <- dcast(
      data.dt[
        bin %in% ref_bin & assay == assay_input &
          k.type == k_type & t.type == t_type, 
        .(
          CAR.align, 
          bin.sort.group = paste(sort.group, bin, sep = '_'), 
          k.type, t.type, batch, assay, donor, 
          sort.group, 
          bin,
          counts)], 
      CAR.align ~ bin.sort.group, value.var='counts')
    
    ref.weights <- dcast(data.weights[
        bin %in% ref_bin & assay == assay_input &
            k.type == k_type & t.type == t_type, 
        .(
          bin.sort.group = paste(sort.group, bin, sep = '_'), 
          k.type, t.type, batch, assay, donor, 
          sort.group,
          bin,
          read.weight.norm)],
        . ~ bin.sort.group, 
        value.var = 'read.weight.norm')
    
    stopifnot(nrow(ref.bin.dt) == nrow(unique(ref.bin.dt)))
  }
  
  # copy the ref bin columns for each of the test bin columns
  if (interaction == T) {
    
    num.ref.reps <- ncol(ref.bin.dt) - 1
    
    ref.bin.dt <- cbind(ref.bin.dt[, 1], 
      do.call("cbind", replicate(length(test_bin), 
      ref.bin.dt[, -1], simplify = FALSE)))
    
    names(ref.bin.dt) <- c(names(ref.bin.dt[, 1]), 
      paste(names(ref.bin.dt[, -1]), rep(test_bin, each=num.ref.reps), 
        sep = '_'))
  }
  
  ## 3. Prepare Test Bins ============
  test.bin.dt <- dcast(
      data.dt[
        bin %in% test_bin & assay == assay_input &
          k.type == k_type & t.type == t_type, 
        .(
          CAR.align, 
          bin.sort.group = paste(sort.group, bin, sep = '_'), 
          k.type, t.type, batch, assay, donor, 
          sort.group, 
          bin,
          counts)], 
      CAR.align ~ bin.sort.group, value.var='counts')
  
  # check that replicate counts match
  stopifnot(nrow(ref.bin.dt) == nrow(unique(ref.bin.dt)))
  stopifnot(nrow(test.bin.dt) == nrow(unique(test.bin.dt)))
  
  ## 4. Merge and create design matrix ============
  cts <- merge(ref.bin.dt, test.bin.dt, by = 'CAR.align')
  cts <- data.frame(cts[, -1], row.names = cts[, CAR.align])
  cts[is.na(cts)] <- 0
  
  coldata <- data.frame(
    condition = c(
      rep('reference', ncol(ref.bin.dt) - 1), 
      rep('test', ncol(test.bin.dt) - 1)), 
    rep = data.table(t(sapply(strsplit(c(
      names(ref.bin.dt)[-1], 
      names(test.bin.dt)[-1]),"_"), `[`, c(1,2))))[,
        paste(V1, V2, sep='_')],
    bin = sapply(strsplit(c(
      names(ref.bin.dt)[-1], 
      names(test.bin.dt)[-1]),"_"), `[`, 7),
    row.names = c(names(ref.bin.dt)[-1], names(test.bin.dt)[-1]))
  
  dds <- DESeqDataSetFromMatrix(countData = cts,
                                colData = coldata,
                                design =  ~ condition + rep)
  
  # set reference
  dds$condition <- relevel(dds$condition, ref = "reference")
  
  print(coldata)
  
  # pre-filtering
  keep <- rowSums(counts(dds)) >= 10
  dds <- dds[keep,]
  
  # try various designs of decreasing complexity, since we don't have a
  # full matrix for every example deseq run.
  
  # try_designs <- function(designs, dds) tryCatch({
  #   message(paste("trying design:",as.character(designs[[1]]),"\n"))
  #   DESeq2::design(dds) <- designs[[1]]
  #   return(DESeq2::DESeq(dds)) }, error= function(e) {
  #     dds <- try_designs(designs[c(-1)], dds)
  #     message(e)
  #     message('getting here\n')
  #     return(dds)
  #   }
  # )
  #     
  # try_designs(
  #   designs=c(
  #     ~ condition + condition:rep + condition:bin,
  #     ~ condition + rep + bin,
  #     ~ condition + rep,
  #     ~ condition
  #   ),
  #   dds
  # )
  
  # if (control_replicates) {
  #     design(dds) <- ~ condition + rep
  # }
  # 
  # #check unique bins before using bins as contrast
  # n_uniq_bins <- length(unique(coldata$bin))
  # 
  # if (n_uniq_bins == 1 & interaction == T) {
  #   warning('Cannot use bin contrast, only one bin level.')
  # }
  # 
  # if (interaction == T & group.control == T & n_uniq_bins > 1) {
  #   design(dds) <- ~ condition + condition:rep + condition:bin
  # } else if (interaction == T & n_uniq_bins > 1) {
  #   design(dds) <- ~ condition + rep + bin
  # }

  dds <- DESeq2::DESeq(dds)
  res <- results(dds)
  
  # shrink log fold change
  resLFC <- lfcShrink(dds, 
    coef="condition_test_vs_reference", type="apeglm")
  
  # convert to data.table
  results.dt <- as.data.table(resLFC)[, CAR.align := row.names(resLFC)]
  results.dt <- cbind(results.dt[, 6], results.dt[, -6])
  results.dt[, assay := assay_input][, k.type := k_type][, t.type := t_type]
  
  return(results.dt)
}

Run DESeq2 on all comparisons

if (rerun_deseq) {
  test_ref_sets <- c(ref=list(), test=list())
  
  # A/B/AB vs C/D/CD
  test_ref_sets$ref <- c(as.list(rep('D',3)), as.list(rep('C',3)), rep(list(c('C','D')),3))
  test_ref_sets$test <- rep(list('A','B',c('A','B')),3)
  test_ref_sets$interaction <- as.list(rep(F, 9))
  
  # A/B/AB/ABCD vs baseline
  test_ref_sets$ref <- c(test_ref_sets$ref, as.list(rep('baseline',3)))
  test_ref_sets$test <- c(test_ref_sets$test, list('A','B',c('A','B','C','D')))
  test_ref_sets$interaction <- c(test_ref_sets$interaction, as.list(rep(T, 3)))
  
  all.deseq.results.dt <- data.table()
  
  for (set_i in seq_along(test_ref_sets$ref)) {
      
    ref_set <- test_ref_sets$ref[[set_i]]
    test_set <- test_ref_sets$test[[set_i]]
    
    ref_str <- paste0(ref_set, collapse='')
    test_str <- paste0(test_set, collapse='')
    
    inter <- test_ref_sets$interaction[[set_i]]
    
    deseq.results.dt <- read.counts[batch != 'post-cytof' & !is.na(k.type),
      {
        message(paste(c(ref_str, test_str, inter, .BY[1],"\n"), collapse= ' - '));
        tryCatch(
          run_deseq(
            data.dt = .SD, 
            ref_bin = ref_set,
            test_bin = test_set,
            interaction = inter,
            group.control = T),
          error= function(e) {message(e); return(data.table())}
        )
      },
      by = .(group)]
    
    if (nrow(deseq.results.dt) > 0) {
      deseq.results.dt[,
        `:=`(
          ref_set = ref_str,
          test_set = test_str,
          inter = inter)]
    }
    
    all.deseq.results.dt <- rbind(
      all.deseq.results.dt,
      deseq.results.dt, fill=T)
  }
  
  save(list=c('all.deseq.results.dt'), 
     file=file.path(data.output.dir, 'pooled_deseq2_data.Rdata'))
}

Plots

if (!rerun_deseq) load(
  file=file.path(data.output.dir, 'pooled_deseq2_data.Rdata'))

#add back CAR scores
cols_to_add <- c('CAR.score','sort.group','donor','batch')
cols_to_join <- c('group', 'CAR.align', 'assay', 'k.type', 't.type')


all.deseq.results.dt[, padj.disp := -log10(padj)]
all.deseq.results.dt[, lfc.disp := log2FoldChange]
all.deseq.results.dt[padj.disp > 10, padj.disp := Inf]
all.deseq.results.dt[abs(lfc.disp) > 5, lfc.disp := sign(lfc.disp) * Inf]

# mask receptor names except for known ones
control_domains <- c('4-1BB','CD28')
chosen_domains <- c('BAFF-R','CD40','TACI','TNR8')
neg_domain <- c('KLRG1')

all.deseq.results.dt[, CAR.type := 'other']
all.deseq.results.dt[CAR.align %in% control_domains, CAR.type := 'control']
all.deseq.results.dt[CAR.align %in% chosen_domains, CAR.type := 'chosen']
all.deseq.results.dt[CAR.align %in% neg_domain, CAR.type := 'neg']
all.deseq.results.dt[, 
  CAR.type := factor(CAR.type,levels=c('other','control','chosen','neg'))]

make_volcanoes <- function(data.dt) {
  ggplot(data.dt, aes(
    x=lfc.disp, y=padj.disp, 
    color=CAR.type,
    label=CAR.align,
    size=CAR.type)) + 
  geom_point() +
  geom_hline(yintercept=-log10(0.05), linetype=2) +
  facet_grid(test_set + ref_set ~ t.type + assay + k.type) +
  scale_color_manual('',
    labels=c('Other Receptors', 'CD28/4-1BB', 'New Receptors','Negative'),
    values=c('grey50', RColorBrewer::brewer.pal(5, 'Paired')[c(2,4,5)])) +
  scale_size_manual('',
    labels=c('Other Receptors', 'CD28/4-1BB', 'New Receptors','Negative'),
    values=c(1,3,3,3)) +
  labs(x='Log2 FC', y='-log10(P-value)', title='Assay Volcano Plots')
}

make_timeseries <- function(data.dt) {
  ggplot(data.dt, aes(
    y=lfc.disp, x=assay, 
    color=CAR.type,
    group=CAR.align,
    label=CAR.align,
    size=CAR.type)) + 
    geom_point() +
    geom_line() +
    facet_grid(t.type ~ test_set + ref_set) +
    scale_color_manual('',
      labels=c('Other Receptors', 'CD28/4-1BB', 'New Receptors','Negative'),
      values=c('grey50', RColorBrewer::brewer.pal(5, 'Paired')[c(2,4,5)])) +
    scale_size_manual('',
      labels=c('Other Receptors', 'CD28/4-1BB', 'New Receptors','Negative'),
      values=c(0.5,1,1,1)) +
    labs(y='Log2 FC', x='Assay', title='Log fold change across assays')

}

make_cd4_cd8 <- function(data.dt) {
  ggplot(
    dcast(data.dt, 
    CAR.align + assay + k.type + ref_set + test_set + inter + CAR.type ~ t.type, 
    value.var = c("log2FoldChange", "padj.disp")), 
  aes(y=log2FoldChange_CD8, x=log2FoldChange_CD4,
      color=CAR.type,
      label=CAR.align,
      size=CAR.type)) + 
  geom_point() +
  facet_grid(test_set + ref_set ~  assay + k.type) +
  scale_color_manual('',
    labels=c('Other Receptors', 'CD28/4-1BB', 'New Receptors','Negative'),
    values=c('grey50', RColorBrewer::brewer.pal(5, 'Paired')[c(2,4,5)])) +
  scale_size_manual('',
    labels=c('Other Receptors', 'CD28/4-1BB', 'New Receptors','Negative'),
    values=c(1,3,3,3)) +
  labs(x='CD4', y='CD8', title='Log fold change, CD4 vs CD8')
}

make_pos_neg <- function(data.dt) {
  ggplot(
    dcast(data.dt, 
    CAR.align + assay + t.type + ref_set + test_set + inter + CAR.type ~ k.type, 
    value.var = c("lfc.disp", "padj.disp")), 
  aes(y=lfc.disp_pos, x=lfc.disp_neg,
      color=CAR.type,
      label=CAR.align,
      size=CAR.type)) + 
  geom_point() +
  facet_grid(test_set + ref_set ~ t.type + assay) +
  scale_color_manual('',
    labels=c('Other Receptors', 'CD28/4-1BB', 'New Receptors','Negative'),
    values=c('grey50', RColorBrewer::brewer.pal(5, 'Paired')[c(2,4,5)])) +
  scale_size_manual('',
    labels=c('Other Receptors', 'CD28/4-1BB', 'New Receptors','Negative'),
    values=c(1,3,3,3)) +
  labs(x='CD19-', y='CD19+', title='Log fold change, CD19+ vs CD19-')
}

Replicates comparisons

Comparisons on x/y, all combos

x_group <- 'prolif2_d1_3d_CTV1_CD4_neg'
y_group <- 'prolif2_d1_3d_CTV1_CD4_pos'

cast_comparison <- function(
    comp.df, x_group, y_group, value_col='CAR.score', xycols=c('x','y'),
    rescale= 'combined') {
  
  #message(paste(x_group, y_group, sep=', '))
  
  cast_groups <- dcast(
    unique(comp.df[sort.group %in% c(x_group, y_group), 
        list(CAR.align, sort.group, get(value_col))]), 
    CAR.align ~ sort.group, value.var = 'V3')[, 
      `:=`(x.group= x_group, y.group= y_group)]
  
  names(cast_groups)[c(2,3)] <- xycols
  
  stopifnot(rescale %in% c('combined','separate'))
  
  # rescle == combined:
  # rescale both x and y to (0,1) on same scale
  xy_scalemin = cast_groups[, min(c(x,y), na.rm=T)]
  xy_scalemax = cast_groups[, max(c(x,y), na.rm=T)]
  tryCatch({
    cast_groups[, x_scaled_comb := scales::rescale(
      x, from=c(min(c(x,y), na.rm=T), max(c(x,y), na.rm=T)))]
    cast_groups[, y_scaled_comb := scales::rescale(
      y, from=c(min(c(x,y), na.rm=T), max(c(x,y), na.rm=T)))]
  }, error = function(e) {
    message(e)
    cast_groups[, x_scaled_comb := NaN]
    cast_groups[, y_scaled_comb := NaN]}
  )
    
  # rescle == separate:
  # rescale x and y to (0,1) on individual scales
  tryCatch({
    cast_groups[, x_scaled_sep :=  scales::rescale(x)]
    cast_groups[, y_scaled_sep :=  scales::rescale(y)]
  }, error = function(e) {
    message(e)
    cast_groups[, x_scaled_sep := NaN]
    cast_groups[, y_scaled_sep := NaN]}
  )
}

All Replicate Comparisons

# use malanhanobis distance to collapse replicates, 
# then use median absolute deviation to identify outliers
#https://www.r-craft.org/r-news/combined-outlier-detection-with-dplyr-and-ruler/

maha_dist <- . %>% select_if(is.numeric) %>%
    mahalanobis(center = colMeans(.), cov = cov(.))

isnt_out_maha <- function(tbl, isnt_out_f, ...) {
  tbl %>% maha_dist() %>% isnt_out_f(...)
}

isnt_out_mad <- function(x, thres = 3, na.rm = TRUE) {
  abs(x - median(x, na.rm = na.rm)) <= thres * mad(x, na.rm = na.rm)
}

top_x_mad <- function(x, top=5, na.rm = TRUE) {
  mads <- abs(x - median(x, na.rm = na.rm))
  top_mads <- order(-mads)[1:top]
  return(!(1:length(mads) %in% top_mads))
}


plot_all_reps <- function(df=read.counts, value_col, df_only=F) {
  
  all_rep_comparisons <- df[
    grepl('CTV', assay)][order(assay)][order(t.type)][order(k.type)][, 
    data.table(matrix(combn(unique(sort.group), 2), ncol=2, byrow=T)),
    by=c('assay','t.type','k.type')]
  
  names(all_rep_comparisons)[4:5] <- c('x.group','y.group')
  
  all_rep_comparisons <- all_rep_comparisons[, 
    cast_comparison(df, x.group, y.group, value_col=value_col)[, 
      c('x.group','y.group') := NULL],
    by=c('assay','t.type','k.type','x.group','y.group')]
  
  # combine with baseline abundance as color
  baseline.abund <- df[
    assay=='baseline', list(mean.baseline= mean(car.abund.baseline, na.rm=T)),
    by=c('t.type','CAR.align')][,
      list(CAR.align, 
        rel.baseline.log= log10(mean.baseline/mean(mean.baseline))), 
      by=c('t.type')]
  
  all_rep_comparisons <-all_rep_comparisons[
    baseline.abund, on=c('t.type','CAR.align')]
  
  all_rep_comparisons[, rep_pair := paste(
    gsub('(prolif\\d_d\\d).*','\\1', x.group), 
    gsub('(prolif\\d_d\\d).*','\\1', y.group), 
    sep='\n')]
  
  all_rep_comparisons[
    rep_pair == "prolif2_d2\nprolif1_d2", rep_pair := "prolif1_d2\nprolif2_d2"] 
  
  all_rep_comparisons[, assay_kt := paste(assay,t.type,k.type, sep='\n')]
  
  # use malanhanobis distance to collapse replicates, 
  # then use median absolute deviation to identify outliers
  all_rep_comparisons[,
    outlier := {   
      
      nonsingular <- apply(tibble(x_scaled_sep, y_scaled_sep), 2, 
        function (x) var(x) > 0 & !is.na(var(x)))
      non_singular_sd <- tibble(x_scaled_sep, y_scaled_sep)[,
        (nonsingular)]
      !isnt_out_maha(non_singular_sd, top_x_mad)
    }, by=.(rep_pair, k.type, t.type, assay)]
  
  all_rep_comparisons[, label_outliers := ''][outlier == T, 
    label_outliers := CAR.align]
  
  all_rep_comparisons[, label_arrayed := ''][
    CAR.align %in% array.list & !outlier, 
    label_arrayed := CAR.align]
  
  non_na_comps <- all_rep_comparisons[, 
    !any(is.na(list(var(x_scaled_sep), var(y_scaled_sep)))), 
    by=.(rep_pair, k.type, t.type, assay)]
  
  if (df_only) return(all_rep_comparisons)
  
  r_squareds <- all_rep_comparisons[
    non_na_comps[V1==T], on=.(rep_pair, k.type, t.type, assay)][V1 == T][, 
      list(r=sqrt(summary(lm(x_scaled_sep ~ y_scaled_sep))$r.squared)), 
      by=.(rep_pair, k.type, t.type, assay)]
  
  r_squareds[, r_lbl := paste('r=',as.character(round(r, 2)))]
  
  return(ggplot(data=all_rep_comparisons, 
      aes(x_scaled_sep, y_scaled_sep)) + 
    geom_point(shape = 21, colour = "grey30", aes(fill=rel.baseline.log)) + 
    geom_text_repel(size=2.5, color='grey20', aes(label=label_outliers)) +
    geom_text_repel(size=2.5, color='lightsalmon4', aes(label=label_arrayed)) + 
    theme_bw() + 
    scale_fill_distiller(palette='BrBG', 
      limit=c(-1,1) * max(abs(all_rep_comparisons$rel.baseline.log))) +
    geom_label(size=2.5, aes(x=Inf, y=-Inf, label=r_lbl), data=r_squareds,
      hjust=1, vjust=0) +
    facet_grid(rep_pair ~  k.type + t.type + assay))

}
read.counts[, car.abund.log := log10(car.abund)]

#assay_rep_list:
assay_rep_set <- unique(read.counts[
  data.table(
    assay=c('baseline','CTV1','CTV2','CTV3'), 
    prev_assay=c(NA,'baseline','CTV1','CTV2')), on='assay'][, 
  list(assay, sort.group, prev_assay, donor, batch, t.type, k.type)])

#map day0 
assay_rep_set <- rbind(
  assay_rep_set[assay=='baseline'][, k.type := 'pos'], 
  assay_rep_set[assay=='baseline'][, k.type := 'neg'], 
  assay_rep_set[assay!='baseline'])

#merge with prev copy to get prev sort.group correspondence
assay_rep_set <- assay_rep_set[, list(
    donor, batch, t.type, k.type,
    prev_assay=assay,prev.sort.group=sort.group)][
  assay_rep_set,
  on=c('donor','batch','t.type','k.type','prev_assay')]

# use baseline from prolif2 always
assay_rep_set[assay == 'CTV1', 
  prev.sort.group := gsub('prolif1','prolif2',prev.sort.group)]

#merge with prev measurements
prev_measure <- unique(read.counts[, list(
  car.abund.prev=car.abund, prev.sort.group=sort.group, CAR.align)])[
    assay_rep_set, on=c('prev.sort.group')]

#merge with orig read counts
read.counts <- prev_measure[, list(
    car.abund.prev, prev.sort.group, CAR.align, sort.group)][
      read.counts, on=c('sort.group','CAR.align')]

#calculate relative prev
read.counts[, car.abund.rel.prev := car.abund/car.abund.prev]


plot_all_reps(value_col='CAR.score')  +
  labs(title='CAR score replicate comparison')
## Warning in summary.lm(lm(x_scaled_sep ~ y_scaled_sep)): essentially perfect fit:
## summary may be unreliable
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

plot_all_reps(value_col='car.abund') +
  labs(title='CAR Abundance replicate comparison')
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

plot_all_reps(value_col='car.abund.log') +
  labs(title='CAR Log Abundance replicate comparison')
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

plot_all_reps(value_col='car.abund.rel.baseline') +
  labs(title='CAR Relative Abundance Change to Baseline, replicate comparison')
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

plot_all_reps(value_col='car.abund.rel.prev') +
  labs(title='CAR Relative Abundance Change to Previous, replicate comparison')
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: Removed 81 rows containing missing values (geom_point).
## Warning: Removed 81 rows containing missing values (geom_text_repel).

## Warning: Removed 81 rows containing missing values (geom_text_repel).

plot_all_reps(value_col='vst_car_score')  +
    labs(title='VST-normalized CAR score replicate comparison')
## Warning in summary.lm(lm(x_scaled_sep ~ y_scaled_sep)): essentially perfect fit:
## summary may be unreliable
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

plot_all_reps(value_col='rlog_car_score')  +
    labs(title='rlog-normalized CAR score replicate comparison')
## Warning in summary.lm(lm(x_scaled_sep ~ y_scaled_sep)): essentially perfect fit:
## summary may be unreliable
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

Combining these together to summarize correlation among replicates

ggplot(data=plot_all_reps(df=read.counts[(CAR.align != 'TNR8')], 
        value_col='car.abund.rel.baseline', df_only=T), 
       aes(x_scaled_sep, y_scaled_sep)) + 
    geom_point(shape = 21, colour = "grey30", aes(fill=rep_pair)) + 
    geom_text_repel(size=2.5, color='grey20', aes(label=label_outliers)) +
    geom_text_repel(size=2.5, color='lightsalmon4', aes(label=label_arrayed)) + 
    theme_bw() + 
    facet_grid(k.type + t.type ~ assay)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

ggplot(data=plot_all_reps(df=read.counts[(CAR.align != 'TNR8')], 
        value_col='car.abund.rel.prev', df_only=T), 
       aes(x_scaled_sep, y_scaled_sep)) + 
    geom_point(shape = 21, colour = "grey30", aes(fill=rep_pair)) + 
    geom_text_repel(size=2.5, color='grey20', aes(label=label_outliers)) +
    geom_text_repel(size=2.5, color='lightsalmon4', aes(label=label_arrayed)) + 
    theme_bw() + 
    facet_grid(k.type + t.type ~ assay)
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: Removed 79 rows containing missing values (geom_point).
## Warning: Removed 79 rows containing missing values (geom_text_repel).

## Warning: Removed 79 rows containing missing values (geom_text_repel).

ggplot(data=plot_all_reps(df=read.counts, 
        value_col='rlog_car_score', df_only=T), 
       aes(x_scaled_sep, y_scaled_sep)) + 
    geom_point(shape = 21, colour = "grey30", aes(fill=rep_pair)) + 
    geom_text_repel(size=2.5, color='grey20', aes(label=label_outliers)) +
    geom_text_repel(size=2.5, color='lightsalmon4', aes(label=label_arrayed)) + 
    theme_bw() + 
    facet_grid(k.type + t.type ~ assay)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

For figure 2B, we will combine rlog_car_score for CTV1 & 2 with baseline abundance for CTV3.

Individual Replicate, comparison among different measurements

comp.df <- copy(read.counts[assay != 'baseline' & batch != 'post-cytof'])
comp.df[, car.abund.rel.prev.scaled := scales::rescale(car.abund.rel.prev), 
  by=sort.group]
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
comp.df[, car.abund.rel.baseline.scaled := scales::rescale(car.abund.rel.baseline),
  by=sort.group]
comp.df[, CAR.score.scaled := scales::rescale(CAR.score), 
  by=sort.group]
comp.df[, vst_car_score.scaled := scales::rescale(vst_car_score), 
  by=sort.group]
comp.df[, rlog_car_score.scaled := scales::rescale(rlog_car_score), 
  by=sort.group]

baseline.abund <- read.counts[
  assay=='baseline', list(mean.baseline= mean(car.abund.baseline, na.rm=T)),
  by=c('t.type','CAR.align')][,
    list(CAR.align, 
      rel.baseline.log= log10(mean.baseline/mean(mean.baseline))), 
    by=c('t.type')]

comp.df <-comp.df[baseline.abund, on=c('t.type','CAR.align')]

plot_measure_pair <- function(df=comp.df, measure_i, measure_j) {
  
  # fix overplotting:
  df <- unique(df[!is.na(get(measure_i)) & !is.na(get(measure_j)), list(
    get(measure_i), get(measure_j), 
    rel.baseline.log,  CAR.align,
    batch, donor, k.type, t.type, assay)])
  
  names(df)[c(1,2)] <- c(measure_i, measure_j)
  
  # use malanhanobis distance to collapse replicates, 
  # then use median absolute deviation to identify outliers
  df[, outlier := {
    
    #print(.BY)
    #print(tibble(get(measure_i), get(measure_j)))
    
      nonsingular <- apply(tibble(get(measure_i), get(measure_j)), 2, 
        function (x) var(x) > 0 & !is.na(var(x)))
      
      non_singular_sd <- tibble(get(measure_i), get(measure_j))[,
        (nonsingular)]
      
      # if both measures are singular
      if (!any(nonsingular)) (FALSE)
      # for specific base of prev vs baseline and CTV1, will be singular
      else if ((ncol(non_singular_sd) == 2 &&
            all(non_singular_sd[,1] == non_singular_sd[,2])) | 
          ncol(non_singular_sd) != 2) {
        !top_x_mad(pull(non_singular_sd, 1))
      } else {
        !isnt_out_maha(non_singular_sd, top_x_mad)
      }
    }, by=.(batch, donor, k.type, t.type, assay)]
  
  df[, label_outliers := ''][outlier == T, 
    label_outliers := CAR.align]
  
  df[, label_arrayed := ''][
    CAR.align %in% array.list & !outlier,  
    label_arrayed := CAR.align]
  
  r_squareds <- df[, 
    list(r=sqrt(summary(lm(data=.SD, 
      as.formula(paste(measure_i, "~", measure_j))))$r.squared)), 
   by=.(batch, donor, k.type, t.type, assay)]
  
  r_squareds[, r_lbl := paste('r=',as.character(round(r, 2)))]
  
  return(ggplot(df, aes_string(
      x=measure_i, y=measure_j)) + 
    geom_point(data=df, shape = 21, colour = "grey30", aes(fill=rel.baseline.log)) + 
    geom_text_repel(data=df, size=2.5, color='grey20', aes(label=label_outliers)) +
    geom_text_repel(data=df, size=2.5, color='lightsalmon4', aes(label=label_arrayed)) + 
    theme_bw() +
    geom_label(size=2.5, aes(x=Inf, y=-Inf, label=r_lbl), data=r_squareds,
      hjust=1, vjust=0) +
    scale_fill_distiller(palette='BrBG', 
      limit=c(-1,1) * max(abs(df$rel.baseline.log))) +
    facet_grid(batch+donor ~ k.type + t.type + assay) +
    labs(title=paste(measure_i,'vs.',measure_j,'Individual replicates')))
}

deseq2_rc_measures <- comp.df[
  dcast(all.deseq.results.dt[,
    contrast := paste(ref_set, test_set, sep='_v_')][, 
      `:=`(log2FoldChange.scaled=scales::rescale(log2FoldChange)), 
      by=.(k.type, t.type, assay, contrast)], 
        k.type + t.type + assay + CAR.align ~ contrast, 
        value.var = c('log2FoldChange', 'padj')),
    on=c('k.type','t.type','assay','CAR.align')]

CAR score comparisons

plot_measure_pair(comp.df, 'vst_car_score.scaled', 'CAR.score.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

plot_measure_pair(comp.df, 'rlog_car_score.scaled', 'CAR.score.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

plot_measure_pair(comp.df, 'vst_car_score.scaled', 'rlog_car_score.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

CAR score to baseline abundance

plot_measure_pair(comp.df, 'CAR.score.scaled', 'car.abund.rel.baseline.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

plot_measure_pair(comp.df, 'vst_car_score.scaled', 'car.abund.rel.baseline.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

plot_measure_pair(comp.df, 'rlog_car_score.scaled', 'car.abund.rel.baseline.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

CAR score to prev abundance

plot_measure_pair(comp.df, 'CAR.score.scaled', 'car.abund.rel.prev.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

plot_measure_pair(comp.df, 'vst_car_score.scaled', 'car.abund.rel.prev.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

plot_measure_pair(comp.df, 'rlog_car_score.scaled', 'car.abund.rel.prev.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

CAR score to prev baseline

plot_measure_pair(comp.df, 'car.abund.rel.baseline.scaled', 'car.abund.rel.prev.scaled')
## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(data = .SD, as.formula(paste(measure_i, "~",
## measure_j)))): essentially perfect fit: summary may be unreliable

CAR score to deseq2

plot_measure_pair(deseq2_rc_measures,
  'log2FoldChange_D_v_A',
  'car.abund.rel.prev.scaled') +
  labs(title='DeSeq2 D:A vs Abundance-Prev')

plot_measure_pair(deseq2_rc_measures,
  'log2FoldChange_D_v_A',
  'car.abund.rel.prev.scaled') +
  labs(title='DeSeq2 D:A vs Abundance-Prev')

plot_measure_pair(deseq2_rc_measures,
  'log2FoldChange_D_v_A',
  'car.abund.rel.baseline.scaled') +
  labs(title='DeSeq2 D:A vs Abundance-Baseline')

plot_measure_pair(deseq2_rc_measures,
  'log2FoldChange_CD_v_AB',
  'car.abund.rel.prev.scaled') +
  labs(title='DeSeq2 CD:AB vs Abundance-Prev')

plot_measure_pair(deseq2_rc_measures,
  'log2FoldChange_CD_v_AB',
  'car.abund.rel.baseline.scaled') +
  labs(title='DeSeq2 CD:AB vs Abundance-Baseline')

plot_measure_pair(deseq2_rc_measures[, log2FoldChange_CD_v_A.scaled := scales::rescale(log2FoldChange_CD_v_A), by=.(k.type, t.type, assay)],
                  'log2FoldChange_CD_v_A.scaled',
                  'rlog_car_score.scaled') +
    labs(title='DeSeq2 CD:A vs R-log-scaled CAR score')
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf

The final plot - r-log-scaled car score to CD:A deseq ratio, seems to be the best correspondence I have seen.

multivariate linear regression

Not using this right now.

# bin_pcts <- dcast(
#   comp.df[, 
#     list(bin, cell.count, car.bin.pct, sort.group, 
#       assay_group= paste(.BY[c(2,3,4)], collapse='_')), 
#     by=c('CAR.align','assay','t.type','k.type')], 
#     CAR.align + assay + t.type + k.type + sort.group + assay_group ~ cell.count, 
#     value.var='car.bin.pct')
# 
# cell_counts <- dcast(comp.df[, 
#   list(bin, cell.count, car.bin.pct, sort.group, car.bin.read.pct, 
#     assay_group= paste(.BY[c(2,3,4)], collapse='_')), 
#       by=c('CAR.align','assay','t.type','k.type')][, 
#         cell.bin.count := round(car.bin.read.pct * cell.count)], 
#   CAR.align + assay + t.type + k.type + sort.group ~ bin,
#   value.var='cell.bin.count')
# 
# cell_totals <- dcast(
#   unique(comp.df[, list(bin, cell.count, sort.group)][, 
#     bin := paste(bin,'total',sep='.')])[, 
#     cell.pct := cell.count/sum(cell.count, na.rm=T), by=c('sort.group')], 
#   sort.group  ~ bin, 
#   value.var='cell.pct')
# 
# cell_pcts <- dcast(
#   cell_counts[, data.table(bins=c('A','B','C','D'), 
#     c(A,B,C,D)/sum(c(A,B,C,D), na.rm=T)), by=.(CAR.align, sort.group)], 
#     CAR.align + sort.group ~ bins, 
#     value.var='V2')
# 
# cell_counts[cell_pcts, on=c('CAR.align','sort.group')][cell_totals, on=c('sort.group')]
# 
# # cell_counts[cell_pcts, on=c('CAR.align','sort.group')][
# #     cell_totals, on=c('sort.group')][
# #   sort.group == 'prolif2_d2_4d_CTV1_CD8_pos' & CAR.align == 'TNR8', list(
# #   theta_hat= ddirichlet(c(i.A, i.B, i.C, i.D), c(A, B, C, D)), 
# #   theta=ddirichlet(c(A.total, B.total, C.total, D.total), c(A, B, C, D))), 
# #   by=c('CAR.align','sort.group')]

pos/neg CTV3, within t.type

all_rep_comparisons <- read.counts[
  grepl('CTV', assay), 
  data.table(matrix(combn(unique(sort.group), 2), ncol=2, byrow=T)),
  by=c('assay','t.type')]

names(all_rep_comparisons)[3:4] <- c('x.group','y.group')

all_rep_comparisons <- all_rep_comparisons[, 
  cast_comparison(read.counts, x.group, y.group)[,
    c('x.group','y.group') := NULL],
    by=c('assay','t.type','x.group','y.group')]

ggplot(all_rep_comparisons[assay == 'CTV3']) + 
  geom_point(aes(x_scaled_sep, y_scaled_sep)) + 
  facet_wrap(assay + t.type ~ x.group + y.group)
## Warning: Removed 3 rows containing missing values (geom_point).

pos/neg CTV3, within k.type

all_rep_comparisons <- read.counts[
  grepl('CTV', assay), 
  data.table(matrix(combn(unique(sort.group), 2), ncol=2, byrow=T)),
  by=c('assay','k.type')]

names(all_rep_comparisons)[3:4] <- c('x.group','y.group')

all_rep_comparisons <- all_rep_comparisons[, 
  cast_comparison(read.counts, x.group, y.group)[,
    c('x.group','y.group') := NULL],
    by=c('assay','x.group','y.group')]

ggplot(all_rep_comparisons[assay == 'CTV3']) + 
  geom_point(aes(x_scaled_sep, y_scaled_sep)) + 
  facet_wrap(assay ~ x.group + y.group)
## Warning: Removed 4 rows containing missing values (geom_point).

Pos Neg Comparisons

# remove prolif1.d2.ctv3 for now - CD4 has only 1 bin, D, 
# and KLRG1 is on top for CD19+ ...need to check this more

pos_neg_comparisons <- read.counts[grepl('CTV', assay), {
  if (length(unique(sort.group)) == 2)
    cast_comparison(
      .SD,
      unique(sort.group[k.type == 'neg']),
      unique(sort.group[k.type == 'pos']))
  },
  by=c('donor','assay','batch','t.type')][
    !(donor == 'd2' & batch == 'prolif1' & assay == 'CTV3')]

ggplot(pos_neg_comparisons[, prolif_donor := interaction(batch, donor)], 
       aes(x=x_scaled_comb, y=y_scaled_comb, label=CAR.align)) + 
    geom_point() + 
    facet_grid(prolif_donor+t.type~assay) + 
    geom_text_repel(data=pos_neg_comparisons[, 
      prolif_donor := interaction(batch, donor)][
         CAR.align %in% c('BAFF-R','TNR8','4-1BB','TACI','CD28','KLRG1','CD40')]) +
    labs(x='CD19- Proliferation (+/- scaled together, per replicate)', 
         y='CD19+ Proliferation (+/- scaled together, per replicate)')

ggplot(pos_neg_comparisons[, prolif_donor := interaction(batch, donor)], 
       aes(x=x_scaled_comb/y_scaled_comb, y=y_scaled_sep, label=CAR.align)) + 
    geom_point() + 
    facet_grid(prolif_donor+t.type~assay) + 
    geom_text_repel(data=pos_neg_comparisons[, 
      prolif_donor := interaction(batch, donor)][
         CAR.align %in% c('BAFF-R','TNR8','4-1BB','TACI','CD28','KLRG1','CD40')]) +
    labs(x='CD19+ Proliferation (+ scaled alone, per replicate)', 
         y='CD19- : CD19+ Proliferation Ratio')

pos_neg_comparisons_melt <- melt(
  pos_neg_comparisons[, 
    xy_ratio := x_scaled_comb/y_scaled_comb], measure.vars= grep(
  '^[xy][^.]', names(pos_neg_comparisons), perl=T, value=T)
  )[, 
    `:=`(var_mean= mean(value, na.rm=T), var_sd= sd(value, na.rm=T)), 
    by=c('t.type','assay','CAR.align','variable')]

cast_left <- paste(names(pos_neg_comparisons_melt)[1:6], collapse = ' + ')
cast_right <- 'variable'
cast_formula <- paste(cast_left, cast_right, sep= ' ~ ')

pos_neg_assay_avg <- dcast(
  pos_neg_comparisons_melt, cast_formula, value.var='var_mean')

pos_neg_assay_plot <- unique(
  pos_neg_assay_avg[, list(t.type, assay, xy_ratio, y_scaled_sep, CAR.align)])
pos_neg_assay_plot[, label.few := '']
pos_neg_assay_plot[
  CAR.align %in% c('BAFF-R','TNR8','4-1BB','TACI','CD28','KLRG1','CD40'),
  label.few := CAR.align]


ggplot(pos_neg_assay_plot, 
    aes(x=xy_ratio, y=y_scaled_sep, label=label.few)) + 
  geom_point() + 
  facet_wrap(t.type ~ assay) + 
  geom_text_repel() +
  labs(
    title='Average Prolifertation and -/+ ratio across replicates',
    y='CD19+ Proliferation (scaled for replicate)', 
    x='CD19- : CD19+ Proliferation Ratio')

ggplotly(ggplot(pos_neg_assay_plot, 
     aes(x=xy_ratio, y=y_scaled_sep, label=CAR.align)) + 
  geom_point() + 
  facet_wrap(t.type ~ assay) + 
  labs(
      title='Average Prolifertation and -/+ ratio across replicates (interactive)',
      y='CD19+ Proliferation (scaled for replicate)', 
      x='CD19- : CD19+ Proliferation Ratio'))

6 panel - all CTVs, CD4s and CD8s averaged separately

ggplot_pos_neg_assay <- ggplot(pos_neg_assay_plot, 
       aes(x=xy_ratio, y=y_scaled_sep, label=label.few)) + 
    geom_point() + 
    facet_grid(t.type ~ assay) + 
    scale_x_continuous(labels=percent) +
    geom_text_repel(size=2.5) +
    geom_vline(xintercept = -Inf) + geom_hline(yintercept = -Inf) +
    theme_minimal() +
    labs(
        title='Average Prolifertation and -/+ ratio across replicates',
        y='CD19+ Proliferation (Scaled)', 
        x='Relative Nonspecific Proliferation')

ggplot_pos_neg_assay

ggsave(here::here('..','figs','pooled','relative_prolif.pdf'), 
       ggplot_pos_neg_assay, height=4, width=7, useDingbats=FALSE)

2 panel - all CD4s and CD8s averaged separately

pos_neg_comparisons_melt <- melt(
  pos_neg_comparisons[, 
    xy_ratio := x_scaled_comb/y_scaled_comb], measure.vars= grep(
  '^[xy][^.]', names(pos_neg_comparisons), perl=T, value=T)
  )[, 
    `:=`(var_mean= mean(value, na.rm=T), var_sd= sd(value, na.rm=T)), 
    by=c('t.type','CAR.align','variable')]

cast_left <- paste(names(pos_neg_comparisons_melt)[1:6], collapse = ' + ')
cast_right <- 'variable'
cast_formula <- paste(cast_left, cast_right, sep= ' ~ ')

pos_neg_assay_avg <- dcast(
  pos_neg_comparisons_melt, cast_formula, value.var='var_mean')

pos_neg_assay_plot <- unique(
  pos_neg_assay_avg[, list(t.type, xy_ratio, y_scaled_sep, CAR.align)])
pos_neg_assay_plot[, label.few := '']
pos_neg_assay_plot[
  CAR.align %in% c('BAFF-R','TNR8','4-1BB','TACI','CD28','KLRG1','CD40'),
  label.few := CAR.align]

ggplot_pos_neg_assay <- ggplot(pos_neg_assay_plot, 
       aes(x=xy_ratio, y=y_scaled_sep, label=label.few)) + 
    geom_point() + 
    facet_grid(t.type ~ .) + 
    scale_x_continuous(labels=percent) +
    geom_text_repel(size=2.5) +
    geom_vline(xintercept = -Inf) + geom_hline(yintercept = -Inf) +
    theme_minimal() +
    labs(
        title='Average Prolifertation and -/+ ratio across replicates',
        y='CD19+ Proliferation (Scaled)', 
        x='Relative Nonspecific Proliferation')

##TODO Change labelled CARs to ones significant in the CD19+ panel

ggsave(here::here('..','figs','pooled','relative_prolif_t_mean.pdf'), 
       ggplot_pos_neg_assay, height=4, width=7, useDingbats=FALSE)

2 panel - +/- no ratio, all CD4s and CD8s averaged separately (BEST)

pos_neg_comparisons_melt <- melt(
  pos_neg_comparisons[, 
    xy_ratio := x_scaled_comb/y_scaled_comb], measure.vars= grep(
  '^[xy][^.]', names(pos_neg_comparisons), perl=T, value=T)
  )[, 
    `:=`(var_mean= mean(value, na.rm=T), var_sd= sd(value, na.rm=T)), 
    by=c('t.type','CAR.align','variable')]

cast_left <- paste(names(pos_neg_comparisons_melt)[1:6], collapse = ' + ')
cast_right <- 'variable'
cast_formula <- paste(cast_left, cast_right, sep= ' ~ ')

pos_neg_assay_avg <- dcast(
  pos_neg_comparisons_melt, cast_formula, value.var='var_mean')

pos_neg_assay_plot <- unique(
  pos_neg_assay_avg[, list(t.type, x_scaled_comb, y_scaled_comb, CAR.align)])
pos_neg_assay_plot[, label.few := '']
pos_neg_assay_plot[
  CAR.align %in% c('BAFF-R','TNR8','4-1BB','TACI','CD28','KLRG1','CD40'),
  label.few := CAR.align]

# simpler plot
ggplot_pos_neg_assay <- ggplot(pos_neg_assay_plot, 
       aes(x=x_scaled_comb, y=y_scaled_comb, label=label.few)) + 
    geom_point() + 
    facet_grid(t.type ~ .) + 
    scale_x_continuous(expand=expansion(mult=1, add=0), limits=c(0, NA)) +
    expand_limits(x=0) +
    geom_text_repel(size=2.5) +
    geom_vline(xintercept = -Inf) + geom_hline(yintercept = -Inf) +
    theme_minimal() +
    labs(
        title='Proliferation, +/- antigen',
        y='CD19+ Proliferation (scaled per replicate)', 
        x='CD19- Proliferation (scaled per replicate)')

# with axis cuts
ggplot_pos_neg_assay <- ggplot(pos_neg_assay_plot, 
       aes(x=x_scaled_comb, y=y_scaled_comb, label=label.few)) + 
    geom_point() + 
    facet_grid(t.type ~ .) + 
    scale_x_continuous(expand=expansion(mult=c(0,.1), add=0), limits=c(0,NA)) +
    scale_y_continuous(limits=c(0.48, 0.95), expand=expansion(mult=c(0,0), add=0), 
      breaks=(5:10)/10, labels=c('', (6:10)/10)) +
    geom_text_repel(size=2.5) +
    geom_vline(xintercept = -Inf) + geom_hline(yintercept = -Inf) +
    labs(
        title='Proliferation, +/- antigen',
        y='CD19+ Proliferation (scaled per replicate)', 
        x='CD19- Proliferation\n(scaled per replicate)') +
    theme_minimal() +
    theme(panel.spacing.y=unit(1.5, "lines"))

# fig2 style

# with axis cuts
ggplot_pos_neg_assay <- ggplot(pos_neg_assay_plot, 
       aes(x=x_scaled_comb, y=y_scaled_comb, label=label.few)) + 
    geom_point() + 
    facet_grid(t.type ~ .) + 
    scale_x_continuous(expand=expansion(mult=c(0,.1), add=0), limits=c(0,NA)) +
    scale_y_continuous(limits=c(0.48, 0.95), expand=expansion(mult=c(0,0), add=0), 
      breaks=(5:10)/10, labels=c('', (6:10)/10)) +
    geom_text_repel(size=2.5) +
    geom_vline(xintercept = -Inf) + geom_hline(yintercept = -Inf) +
    labs(
        title='Proliferation, +/- antigen',
        y='CD19+ Proliferation (scaled per replicate)', 
        x='CD19- Proliferation\n(scaled per replicate)') +
    theme_minimal() +
    theme(panel.spacing.y=unit(1.5, "lines"))

ggsave(here::here('..','figs','pooled','scaled_prolif_t_mean.pdf'), 
       ggplot_pos_neg_assay, height=6, width=3.6, useDingbats=FALSE)

2 panel - CTV1 only, +/- no ratio, all CD4s and CD8s averaged separately

pos_neg_comparisons_melt <- melt(
  pos_neg_comparisons[, 
    xy_ratio := x_scaled_comb/y_scaled_comb], measure.vars= grep(
  '^[xy][^.]', names(pos_neg_comparisons), perl=T, value=T)
  )[, 
    `:=`(var_mean= mean(value, na.rm=T), var_sd= sd(value, na.rm=T)), 
    by=c('t.type','CAR.align','variable', 'assay')]

cast_left <- paste(names(pos_neg_comparisons_melt)[1:6], collapse = ' + ')
cast_right <- 'variable'
cast_formula <- paste(cast_left, cast_right, sep= ' ~ ')

pos_neg_assay_avg <- dcast(
  pos_neg_comparisons_melt, cast_formula, value.var='var_mean')

pos_neg_assay_plot <- unique(
  pos_neg_assay_avg[, list(t.type, x_scaled_comb, y_scaled_comb, CAR.align, assay)])
pos_neg_assay_plot[, label.few := '']
pos_neg_assay_plot[
  CAR.align %in% c('BAFF-R','TNR8','4-1BB','TACI','CD28','KLRG1','CD40'),
  label.few := CAR.align]

ggplot_pos_neg_assay <- ggplot(pos_neg_assay_plot[assay == 'CTV1'], 
       aes(x=x_scaled_comb, y=y_scaled_comb, label=label.few)) + 
    geom_point() + 
    facet_grid(t.type ~ .) + 
    scale_x_continuous() +
    geom_text_repel(size=2.5) +
    geom_vline(xintercept = -Inf) + geom_hline(yintercept = -Inf) +
    theme_minimal() +
    labs(
        title='Average Prolifertation and -/+ ratio across replicates',
        y='CTV1 CD19+ Proliferation (Scaled)', 
        x='CTV1 CD19- Proliferation (Scaled)')

ggsave(here::here('..','figs','pooled','scaled_prolif_t_mean_ctv1.pdf'), 
       ggplot_pos_neg_assay, height=4, width=7, useDingbats=FALSE)

GGplotly interactive comparisons

Bin vs baseline measurements

ggplotly(make_volcanoes(all.deseq.results.dt[
  k.type == 'pos' & ref_set == 'baseline']), 
  tooltip = "label", session='knitr')
ggplotly(make_cd4_cd8(all.deseq.results.dt[
  k.type == 'pos' & ref_set == 'baseline']), 
  tooltip = "label", session='knitr')
ggplotly(make_timeseries(all.deseq.results.dt[
  k.type == 'pos' & ref_set == 'baseline']), 
  tooltip = "label", session='knitr')
ggplotly(make_pos_neg(all.deseq.results.dt[
  ref_set == 'baseline']), 
  tooltip = "label", session='knitr')

Interbin measurements

ggplotly(make_volcanoes(all.deseq.results.dt[
  k.type == 'pos' & ref_set != 'baseline']), 
  tooltip = "label", session='knitr')
ggplotly(make_cd4_cd8(all.deseq.results.dt[
  k.type == 'pos' & ref_set != 'baseline']), 
  tooltip = "label", session='knitr')
ggplotly(make_timeseries(all.deseq.results.dt[
  k.type == 'pos' & ref_set != 'baseline']), 
  tooltip = "label", session='knitr')

cd19+ vs cd19-

ggplotly(make_pos_neg(all.deseq.results.dt[
  ref_set == 'baseline']), 
  tooltip = "label", session='knitr')
ggplotly(make_pos_neg(all.deseq.results.dt[
  ref_set != 'baseline']), 
  tooltip = "label", session='knitr')

Figure 2 Panels

data.output.dir <- file.path(here::here(
  '..','..',
  's3-roybal-tcsl',
  'lenti_screen_compiled_data','data'))

load(
  file=file.path(data.output.dir, 'pooled_deseq2_data.Rdata'))


all.deseq.results.dt[, padj.disp := -log10(padj)]
all.deseq.results.dt[, lfc.disp := log2FoldChange]
all.deseq.results.dt[padj.disp > 10, padj.disp := Inf]
all.deseq.results.dt[abs(lfc.disp) > 5, lfc.disp := sign(lfc.disp) * Inf]

data.dt <- all.deseq.results.dt

data.dt[, CAR.type := 'other']
data.dt[CAR.align == '4-1BB', CAR.type := '4-1BB']
data.dt[CAR.align == 'CD28', CAR.type := 'CD28']
data.dt[, CAR.type := factor(
  CAR.type,levels=c('other','4-1BB','CD28'))]

Replicate Comparison

# combined rlog car score for CTV1 & CTV2, baseline abundance for CTV3
fig2_rep_comp <- rbind(
    plot_all_reps(
        df=read.counts[batch != 'post-cytof'], 
        value_col='rlog_car_score', df_only=T)[assay == 'CTV1'],
    plot_all_reps(
        df=read.counts[batch != 'post-cytof' & CAR.align != 'TNR8'], 
        value_col='car.abund.rel.baseline', df_only=T)[assay != 'CTV1'])

fig2_rep_comp[, assay := factor(assay, 
    labels=c('Initial Proliferation\n(d0-d3/d4)', 
             'Cumulative Proliferation\n(d0-d16)', 
             'Cumulative Proliferation,\n(d0-d24)'))]

replicate_comp_plot <- ggplot(data=fig2_rep_comp, 
       aes(x_scaled_sep, y_scaled_sep)) + 
    geom_point(aes(color=interaction(rep_pair, k.type)), size=0.8) +
    theme_minimal() + 
    theme(panel.border=element_rect(fill=NA)) + 
    facet_grid(t.type ~ assay) +
    scale_color_manual('', 
        values=brewer.pal(11,'PRGn')[c(2,3,4,10,9,8)],
        labels=c(
            'CD19-, A v B', 'CD19-, A v C', 'CD19-, B v C',
            'CD19+, A v B', 'CD19+, A v C', 'CD19+, B v C')) +
    labs(x='Scaled replicate 1', y='Scaled replicate 2')

replicate_comp_plot
## Warning: Removed 1 rows containing missing values (geom_point).

ggsave(here::here('..','figs','pooled','pooled_replicate_comparison.pdf'), 
       replicate_comp_plot, height=4, width=7, useDingbats=FALSE)
## Warning: Removed 1 rows containing missing values (geom_point).

Interbin Volcano

max_pval_y <- 6.5

#CD28 and 4-1BB Colors
receptor_cols <- RColorBrewer::brewer.pal(9, 'YlGnBu')[c(6,8)]

# label significant hits
data.dt[, CAR.type.size := CAR.type]
data.dt[CAR.type == 'other' & 
    ((padj < 0.05 & log2FoldChange > 0) | 
    (padj < 0.05 & log2FoldChange > 0)), 
  CAR.type.size := 'other_sig']

data.dt[, CAR.label.sig := '']
data.dt[CAR.type.size != 'other',
    CAR.label.sig := CAR.align]

# plot subset - CTV1, CTV2, and cumulative of all 3
data.subset.dt <- data.dt[
    k.type == 'pos' & 
        ((ref_set == 'baseline' & test_set == 'A' & assay != 'CTV1') | 
        (ref_set == 'CD' & test_set == 'A' & assay == 'CTV1'))]

# rename facets
data.subset.dt[, assay := factor(assay, 
    labels=c('Initial Proliferation\n(d0-d3/d4)', 
             'Cumulative Proliferation\n(d0-d16)', 
             'Cumulative Proliferation\n(d0-d24)'))]

# max out at log10-15
data.subset.dt[, padj.disp := -log10(padj)]
data.subset.dt[, CAR.pvalmax := padj.disp > max_pval_y]
data.subset.dt[padj.disp > max_pval_y, padj.disp := max_pval_y-0.18]

data.subset.dt[, order := ifelse(CAR.type=="other", 2, 1)]

interbin_volcano <- ggplot(data.subset.dt[order(CAR.type)], 
  aes(
    x=log2FoldChange, y=padj.disp,
    color=CAR.type.size,
    fill=CAR.type.size,
    label=CAR.label.sig,
    size=CAR.type.size,
    shape=CAR.pvalmax)) + 
  geom_point() + 
  facet_grid(t.type ~ assay) +
  scale_color_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28', 'New Receptors'),
    values=c('grey50', receptor_cols, outlier_cols[2]),
    guide=F) +
  scale_fill_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28', 'New Receptors'),
    values=c('grey50', receptor_cols, outlier_cols[2]),
    guide=F) +
  scale_shape_manual(values=c(21,24), guide=F) +
  scale_size_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28', 'New Receptors'),
    values=c(1.5,3,3,2.5), guide=F) +
  geom_hline(yintercept=-log10(0.05), linetype = 'dashed', alpha=.3) +
  labs(x='Relative Proliferation,\nlog2 FC', 
       y='-log10(p)') + 
  scale_y_continuous(limits=c(0, max_pval_y), expand=c(.05,.05,0,0)) +
  scale_x_continuous(limits=c(-3.25,3.25), breaks=c(-3:3)) +
  geom_text_repel(size=3, point.padding=0.25, text.padding=0.25, segment.color='grey') +
  theme_minimal() +
  theme(panel.border=element_rect(fill=NA))
## Warning: Ignoring unknown parameters: text.padding
ggsave(here::here('..','figs','pooled','interbin_volcano.pdf'), 
       interbin_volcano, height=4, width=7, useDingbats=FALSE)

interbin_volcano

CD4 vs CD8s

cast_4v8 <- dcast(data.dt[
    k.type == 'pos' & 
        ((ref_set == 'baseline' & test_set == 'A' & assay != 'CTV1') | 
        (ref_set == 'CD' & test_set == 'A' & assay == 'CTV1'))], 
      CAR.align + assay + CAR.type ~ t.type, 
    value.var = c("log2FoldChange", "padj.disp"))

# rename facets
cast_4v8[, assay := factor(assay, 
    labels=c('Initial Proliferation\n(d0-d3/d4)', 
             'Cumulative Proliferation\n(d0-d16)', 
             'Cumulative Proliferation,\n(d0-d24)'))]

# label significant hits
cast_4v8[, CAR.type.size := CAR.type]
cast_4v8[CAR.type == 'other' & 
    ((padj.disp_CD4 > -log10(0.05) & log2FoldChange_CD4 > 0) | 
    (padj.disp_CD8 > -log10(0.05) & log2FoldChange_CD8 > 0)), 
  CAR.type.size := 'other_sig']

cast_4v8[, CAR.label.sig := '']
cast_4v8[CAR.type.size != 'other',
    CAR.label.sig := CAR.align]
    

prolif_4v8 <- ggplot(cast_4v8[order(CAR.type)], 
  aes(y=log2FoldChange_CD8, x=log2FoldChange_CD4,
      color=CAR.type,
      label=CAR.label.sig,
      size=CAR.type.size)) + 
  geom_point() +
  facet_grid(. ~  assay) +
  scale_color_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28'),
    values=c('grey50', receptor_cols), guide=F) +
  scale_size_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28', 'New Receptors'),
    values=c(0.8,3,3,2.5), guide=F) +
  geom_abline(linetype='dashed') +
  geom_text_repel() +
  theme_minimal() + 
  labs(x='CD4, relative log2 fold change', 
       y='CD8, relative log2 fold change') +
  theme(panel.border=element_rect(fill=NA)) +
  guides(size=F)

prolif_4v8

ggsave(here::here('..','figs','pooled','mixed_4v8.pdf'), prolif_4v8,
       height=2.5, width=7)

CD4 vs CD8s (new version, w/ mean prolif and diff)

cast_4v8[, mean_lfc := rowMeans(cbind(log2FoldChange_CD4, log2FoldChange_CD8))]
cast_4v8[, rel_8 := log2FoldChange_CD8 - log2FoldChange_CD4]

cast_4v8[CAR.type.size == 'other_sig' & xor(padj.disp_CD4 > -log10(0.05), 
  padj.disp_CD8 > -log10(0.05)), CAR.type.size := 'other_sig_diff']

prolif_4v8_rel <- ggplot(cast_4v8[order(CAR.type)], 
  aes(y=rel_8, x=mean_lfc,
      color=CAR.type.size,
      label=CAR.label.sig,
      size=CAR.type.size)) + 
  annotate(geom='label', x=-Inf, y=Inf, label='CD8 Skewed', 
    label.r=unit(0,'pt'), size=3, vjust=.99, hjust=.01) +
  annotate(geom='label', x=-Inf, y=-Inf, label='CD4 Skewed', 
    label.r=unit(0,'pt'), size=3, vjust=.01, hjust=.01) +
  geom_point() +
  facet_grid(. ~  assay, scales='free_x') +
  scale_color_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28', 'New Receptors', 'New Receptors'),
    values=c('grey50', receptor_cols, outlier_cols[2], outlier_cols[2]), guide=F) +
  scale_size_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28', 'New Receptors', 'New Receptors'),
    values=c(0.8,2.5,2.5,1,2.5), guide=F) +
  scale_x_continuous(limits=c(-3.25,3.25), breaks=c(-3:3)) +
  scale_y_continuous(limits=c(-2.25,1.9)) +
  geom_hline(yintercept=0, linetype='dashed', alpha=.7) +
  geom_text_repel(size=3, point.padding=0.25, segment.color='grey') +
  theme_minimal() + 
  labs(y='log2 CD8:CD4 proliferation ratio', 
       x='Relative proliferation,\n log2 mean FC') +
  theme(panel.border=element_rect(fill=NA)) +
  guides(size=F)

ggsave(here::here('..','figs','pooled','mixed_4v8_rel.pdf'), prolif_4v8_rel,
       height=2.5, width=7)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
prolif_4v8_rel
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

CD19+ vs CD19-

cast_posneg <- dcast(data.dt[
      ((ref_set == 'baseline' & test_set == 'ABCD' & assay != 'CTV1') | 
      (ref_set == 'D' & test_set == 'A' & assay == 'CTV1'))], 
    CAR.align + assay + CAR.type + t.type ~ k.type, 
  value.var = c("log2FoldChange", "padj.disp"))
    
# rename facets
cast_posneg[, assay := factor(assay, 
    labels=c('Initial Proliferation\n(d0-d3/d4)', 
             'Cumulative Proliferation\n(d0-d16)', 
             'Cumulative Proliferation,\n(d0-d24)'))]

# label significant hits
cast_posneg[, CAR.type.size := CAR.type]
cast_posneg[CAR.type == 'other' & (
    (2^(-padj.disp_pos) < 0.05 & log2FoldChange_pos > 0) | 
    (2^(-padj.disp_neg) < 0.05 & log2FoldChange_neg > 0)
  ), 
  CAR.type.size := 'other_sig']

prolif_posneg <- ggplot(cast_posneg, 
  aes(y=log2FoldChange_pos, x=log2FoldChange_neg,
      color=CAR.type,
      label=CAR.align,
      size=CAR.type.size)) + 
  geom_point() +
  facet_grid(t.type ~  assay) +
  scale_color_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28'),
    values=c('grey50', receptor_cols)) +
  scale_size_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28', 'New Receptors'),
    values=c(0.8,3,3,2)) +
  geom_abline(linetype='dashed') +
  theme_minimal() + 
  labs(y='Relative log2 fold change, with antigen', 
       x='Relative log2 fold change, no antigen') +
  theme(panel.border=element_rect(fill=NA)) +
  guides(size=F)

prolif_posneg

ggsave(here::here('..','figs','pooled','mixed_posneg.pdf'), prolif_posneg,
       height=4.5, width=7)

Dendroheatmap from #02, copied here to make fig2 panels

Single heatmap, CD4 plus CD8.

##deseq2

all.deseq.results.dt[, comparison := paste(test_set,ref_set,sep= '.')]

deseq2_combined <- all.deseq.results.dt[,
    comparison := paste(test_set, ref_set, sep='.')][,
    combined_var := paste(comparison, assay)][, value := log2FoldChange]

deseq2_combined[, value.scale := scale(value), by = .(t.type, k.type, assay, comparison)]

deseq2_combined[, 
  car.axis := paste(CAR.align,t.type,k.type, sep='|'), by=.(t.type)]

## rlog car score

car_score_means <- read.counts[assay != 'baseline' & batch != 'post-cytof'][,
  value.scale := scales::rescale(rlog_car_score), by=sort.group][,
    value.mean :=  mean(value.scale, na.rm=T), 
      by=.(CAR.align, t.type, k.type, assay)][,
        value.scale := scale(value.mean), by=.(t.type, k.type, assay)]

car_score_means[, 
  car.axis := paste(CAR.align,t.type,k.type, sep='|'), by=.(t.type)]
car_score_means[, comparison := 'rlog_car_score']
car_score_means[, 
  combined_var := paste(assay,comparison, sep='.'), by=.(t.type)]

# fc in proliferation
rel_abund_means <- read.counts[assay != 'baseline' & batch != 'post-cytof'][,
  value.scale := scales::rescale(car.abund.rel.baseline), by=sort.group][,
    value.mean :=  mean(value.scale, na.rm=T), 
      by=.(CAR.align, t.type, k.type, assay)][,
        value.scale := scale(value.mean), by=.(t.type, k.type, assay)]

rel_abund_means[, 
  car.axis := paste(CAR.align,t.type,k.type, sep='|'), by=.(t.type)]
rel_abund_means[, comparison := 'abund_rel_baseline']
rel_abund_means[, 
  combined_var := paste(assay,comparison, sep='.'), by=.(t.type)]



### Cytokines

cytokine_ranks <- melt(
  read.counts[batch == 'post-cytof' & assay != 'IL4'], 
  measure.vars = c('min.max.ratio.norm','all.max.ratio.norm','rlog_car_score'),
  variable.name='comparison')[bin == 'A' & k.type == 'pos']
cytokine_ranks[, value.scale := scale(value), by=.(sort.group,comparison,t.type)]
cytokine_ranks[, 
  car.axis := paste(CAR.align,t.type,k.type, sep='|'), by=.(t.type)]
cytokine_ranks[, 
  combined_var := paste(assay,comparison, sep='.'), by=.(t.type)]

### Combined

shared_cols <- intersect(
  intersect(names(deseq2_combined), names(cytokine_ranks)), 
    names(car_score_means))

total_combined <- unique(rbind(
  deseq2_combined[, shared_cols, with=F],
  cytokine_ranks[comparison == 'rlog_car_score', shared_cols, with=F],
  car_score_means[comparison == 'rlog_car_score', shared_cols, with=F],
  rel_abund_means[comparison == 'abund_rel_baseline', shared_cols, with=F],
  use.names=F))


# relabel x axis assays
#total_combined[, assay := factor(assay, 
#  labels=c('CTV1','CTV2','CTV3','CD69','IFNy','IL2'))]

Together and separate CD4 and CD8 heatmaps.

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.13.4
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:data.table':
## 
##     set
## The following object is masked from 'package:stats':
## 
##     cutree
library(ggdendro)
## 
## Attaching package: 'ggdendro'
## The following object is masked from 'package:dendextend':
## 
##     theme_dendro
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
library(gtable)
library(grid)

make_dendroheatmap <- function(
  df, title, dendro_units=grid::unit(0.2, "null"), 
  legend_theme=theme(legend.position = "none"))
{
  
  # Make dendrogram
  car_mean_cast <- dcast(
    df[k.type == 'pos'], 
    CAR.align ~ assay + t.type, 
    value.var='value.scale')
  
  car_dendro_m <- as.matrix(car_mean_cast[, -c(1:2)])
  rownames(car_dendro_m) <- unlist(car_mean_cast[,1])
  car_dendro <- as.dendrogram(hclust(d = dist(x = car_dendro_m)))
  car_dendro <- rotate(car_dendro, names(sort(rowMeans(car_dendro_m))))

  # Create dendrogram plot
  dendro_vars_plot <- ggdendrogram(data = car_dendro, rotate = TRUE) + 
    theme(axis.text.y = element_text(size = 6))
  
  # row order
  df$CAR.align <- factor(
      df$CAR.align,
      levels = labels(car_dendro))
  
  #manual x labels
  x_lab_man <- c('CD69 18h',
    'Prolif d3-4','Prolif d14-16','Prolif d24',
    'IFNy d3', 'IL2 d3', 'CD69 18h',
    'Prolif d4','Prolif d14-16','Prolif d24')
  
  # heatmap
  var_heatmap <- ggplot(df) + 
    geom_tile(aes(x = interaction(assay,t.type), y = CAR.align, 
                  fill = value.scale), colour = "black", size = .20) + 
    scale_fill_distiller('Z-score', 
      palette='PiYG', limits=c(-3,3), oob=scales::squish, direction=1) +
    scale_x_discrete(expand=c(0,0),
      labels=x_lab_man) +
    scale_y_discrete(labels= (
        function (breaks)
          unlist(lapply(breaks, function(str)
            strsplit(str, '|',fixed=T)[[1]][1])))) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
    labs(title=title)
  
    var_heatmap <- var_heatmap + legend_theme
  
  # col dendro
  dendro_data_col <- dendro_data(car_dendro, type = "rectangle")
  
  dendro_col <- axis_canvas(var_heatmap, axis = "y", coord_flip=T) + 
    geom_segment(data = segment(dendro_data_col), 
        aes(x = x, y = y, xend = xend, yend = yend)) +
      coord_flip()
  plot_dendroheat <- insert_yaxis_grob(var_heatmap, 
    dendro_col, dendro_units, position = "right")
  return(ggdraw(plot_dendroheat))
}

# make_dendroheatmap(total_combined[xor(comparison == 'rlog_car_score', grepl('CTV',assay)) & k.type == 'pos'],
#   title='Deseq2 for CTV, car score for markers')
# make_dendroheatmap(total_combined[comparison == 'rlog_car_score' & k.type == 'pos'],
#   title='car score for all')
# 
# make_dendroheatmap(total_combined[(assay == 'CTV1' & comparison == 'rlog_car_score') | (assay %in% c('CTV2','CTV3') & comparison == 'ABCD.baseline') | (!(assay %in% c('CTV1','CTV2','CTV3')) & comparison == 'rlog_car_score')],
#   title='same_as_other_fig2_panels')
# 
# make_dendroheatmap(total_combined[t.type == 'CD4' & ((assay == 'CTV1' & comparison == 'rlog_car_score') | (assay %in% c('CTV2','CTV3') & comparison == 'ABCD.baseline') | (!(assay %in% c('CTV1','CTV2','CTV3')) & comparison == 'rlog_car_score'))],
#   title='same_as_other_fig2_panels - cd4 only')
# 
# make_dendroheatmap(total_combined[t.type == 'CD8' & ((assay == 'CTV1' & comparison == 'rlog_car_score') | (assay %in% c('CTV2','CTV3') & comparison == 'ABCD.baseline') | (!(assay %in% c('CTV1','CTV2','CTV3')) & comparison == 'rlog_car_score'))],
#   title='same_as_other_fig2_panels - cd8 only')
# 
# make_dendroheatmap(total_combined[xor(comparison == 'rlog_car_score', grepl('CTV',assay)) & k.type == 'pos' & t.type == 'CD4'],
#   title='CD4 only, Deseq2 for CTV, car score for markers')
# make_dendroheatmap(total_combined[comparison == 'rlog_car_score' & k.type == 'pos' & t.type == 'CD4'],
#   title='CD4 only, car score for all')
# 
# make_dendroheatmap(total_combined[xor(comparison == 'rlog_car_score', grepl('CTV',assay)) & k.type == 'pos' & t.type == 'CD8'],
#   title='CD8 only, Deseq2 for CTV, car score for markers')
# make_dendroheatmap(total_combined[comparison == 'rlog_car_score' & k.type == 'pos' & t.type == 'CD8'],
#   title='CD8 only, car score for all')

## combined 4 and 8 with separate dendrograms

#chosen metrics 1
# chosen_metrics <- total_combined[k.type == 'pos' &
#   ((assay == 'CTV1' & comparison == 'A.CD') | 
#     (assay %in% c('CTV2','CTV3') & comparison == 'ABCD.baseline') | 
#       (!(assay %in% c('CTV1','CTV2','CTV3')) & comparison == 'rlog_car_score'))]

#chosen metrics 2
chosen_metrics <- total_combined[k.type == 'pos' &
  ((assay == 'CTV1' & comparison == 'A.baseline') |
    (assay %in% c('CTV2','CTV3') & comparison == 'abund_rel_baseline') |
      (!(assay %in% c('CTV1','CTV2','CTV3')) & comparison == 'rlog_car_score'))]

fig_2a_single_dendro <- make_dendroheatmap(
  chosen_metrics,
  title='',
  legend_theme=theme(
      legend.position = c(1, 0), 
      legend.justification = c(-1.7, 0),
      plot.margin = unit(c(5.5,33,33,5.5), "pt")))

fig_2a_single_dendro

### proliferation plot

library(ggnewscale)

prolif_data <- read.counts[
  (k.type =='pos' | is.na(k.type)) & batch != 'post-cytof'][,
    list(batch, donor, 
      mean.car.abund.rel.baseline= mean(log2(car.abund.rel.baseline)), 
      sd.car.abund.rel.baseline= 2^sd(log2(car.abund.rel.baseline))), 
    by=c('CAR.align', 't.type','assay')]

#reorder CAR names to put CD28 and 41BB in front (at end of levels list)
new_car_levels  <- prolif_data[, c(
  levels(CAR.align)[!(levels(CAR.align) %in% c('4-1BB','CD28'))], 
  '4-1BB','CD28')]

prolif_data[, CAR.align := factor(CAR.align, new_car_levels)]

prolif_data[, CAR.type := 'other']
prolif_data[CAR.align == '4-1BB', CAR.type := '4-1BB']
prolif_data[CAR.align == 'CD28', CAR.type := 'CD28']
prolif_data[, CAR.type := factor(
  CAR.type,levels=c('other','4-1BB','CD28'))]

top_num <- 6

label_data <- unique(prolif_data[assay=='CTV3', 
  list(t.type, CAR.align, mean.car.abund.rel.baseline, assay, CAR.type)])[, 
    car_order := rank(mean.car.abund.rel.baseline), by='t.type'][
      car_order %in% c(1:(1+top_num),(40-top_num):40) | 
        CAR.align %in% c('CD28','41BB')]

#color labels pink vs green
label_data[, CAR.topbot := CAR.type]
label_data[car_order %in% 1:(1+top_num) & CAR.type == 'other', 
  CAR.topbot :='top']
label_data[car_order %in% (40-top_num):40 & CAR.type == 'other',
  CAR.topbot :='bot']
label_data[, CAR.topbot := factor(CAR.topbot)]

fig2_abundance_over_time <- ggplot(prolif_data[order(CAR.type)], aes(
    y=mean.car.abund.rel.baseline, x=assay,
    color=CAR.type,
    group=CAR.align,
    label=CAR.align)) + 
  # line
  geom_line(aes(size=CAR.type)) +
  scale_size_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28'),
    values=c(0.5,1.5,1.5), guide=F) +
  scale_color_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28'),
    values=c('grey30', receptor_cols),
    guide=F) +
  #points
  new_scale('size') +
  geom_point(aes(size=CAR.type, color=CAR.type, group=CAR.align)) +
  scale_size_manual('',
    labels=c('New Receptors', '4-1BB', 'CD28'),
    values=c(1,2,2), guide=F) +  
  # text
  new_scale('color') +
  geom_text_repel(data=label_data,
    aes(color=CAR.topbot),
    nudge_x = 1.2,
    direction = "y",
    hjust = 1,
    size = 3,
    segment.size = 0.3,
    segment.color='grey') +
  scale_color_manual('',
    labels=c('4-1BB', 'CD28','Top','Bottom'),
    values=c(receptor_cols, outlier_cols),
    guide=F) +
  facet_grid(. ~ t.type) +
  scale_x_discrete(
    'Days of continuous CD19 stimulation', 
    expand = c(0, 0.4, 0, 1.4), 
    labels=c('Baseline', 'Day 3-4', 'Day 14-16', 'Day 24')) +
  scale_y_continuous('Relative Proliferation,\n log2 FC') +
  theme_minimal() +
  theme(panel.border=element_rect(fill=NA))


ggsave(here::here('..','figs','pooled','abund_over_time.pdf'), fig2_abundance_over_time,
       height=3, width=8)

fig2_abundance_over_time

Cytokines from pooled 02

# cd19+/cd19- cols

cytokine.pct.pos <- dcast(
  read.counts[batch == "post-cytof" & t.type == 'CD4'][
    bin %in% c('C','D'), list(pct_pos= sum(car.bin.pct)), 
      by=c('assay','CAR.align', 'k.type')],
  assay + CAR.align ~ k.type, value.var='pct_pos')

cytokine.pct.pos[, CAR.align.assay := factor(paste(CAR.align,assay,sep='|'))]
cytokine.pct.pos[, CAR.align.assay := factor(
  CAR.align.assay, 
  levels=levels(CAR.align.assay),
  labels=cytokine.pct.pos[, gsub('(.*)\\..*', '\\1', levels(CAR.align.assay))])]

cytokine_panel <- ggplot(cytokine.pct.pos[assay %in% c("IL2","IFNy")]) + 
  geom_segment(aes(x = reorder(CAR.align.assay, neg-pos), 
                   xend = reorder(CAR.align.assay, neg-pos), 
                   y = neg, 
                   yend = pos), color="grey") +
  geom_point(
    aes(x = reorder(CAR.align.assay, neg-pos), y = pos, color='CD19+'), 
    size=2.5) +
  geom_point(
    aes(x = reorder(CAR.align.assay, neg-pos), y = neg, color='CD19-'), 
    size=2.5) + 
  scale_color_manual(
      values=c('CD19+'=cd19_val[2], 'CD19-'='grey30')) + 
  facet_wrap(~ assay, scales='free_x') +
  scale_y_continuous(label=percent) +
  scale_x_discrete(labels= (
        function (breaks)
          unlist(lapply(breaks, function(str)
            strsplit(str, '|',fixed=T)[[1]][1])))) +
  labs(y = '% Positive (ICS)') +
  theme_minimal() +
  theme(panel.border=element_rect(fill=NA),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5),
        axis.title.x = element_blank(),
        panel.grid.major.x = element_blank(),
        strip.background = element_rect(colour="white", fill="white"),
        legend.title = element_blank(),
        legend.justification=c(1,1),
        legend.position=c(0.99,1.01))

ggsave(here::here('..','figs','pooled','cytokine_plot.pdf'), cytokine_panel,
       height=3, width=8)

cytokine_panel

### Combined panels for figure 2

#old version with replicate comparison
# plot_grid(fig_2a_single_dendro, 
#   plot_grid(
#     replicate_comp_plot + coord_fixed(ratio=1),
#     interbin_volcano + coord_fixed(ratio=1),
#     prolif_4v8 + coord_fixed(ratio=1),
#     rel_heights=c(2.3, 2.3, 1.3),
#     rel_widths=c(1,1,1),
#     labels = c("B", "C", "D"), align='v', axis='lr',
#     ncol = 1),
#   labels=c('A',''))

bcd_add_left_margin <- theme(plot.margin = unit(c(5.5,5.5,5.5,22), "pt"))

plot_bcd <- plot_grid(
  fig2_abundance_over_time + bcd_add_left_margin, #b
  interbin_volcano + bcd_add_left_margin,         #c
  prolif_4v8_rel + bcd_add_left_margin,           #d
  rel_heights=c(1.6, 2.3, 1.6),
  rel_widths=c(1,1,1),
  labels = c("B", "C", "D"), align='v', axis='lr',
  ncol = 1)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
plot_abcd <- plot_grid(
  make_dendroheatmap(
    chosen_metrics,
    title='',
    legend_theme=theme(
      legend.position = c(1, -0.08), 
      legend.justification = c(-0.6, 0),
      axis.ticks = element_blank(),
      axis.text.x = element_text(angle=90, vjust = 0.5, hjust=1),
      plot.margin = unit(c(5.5,38,65,5.5), "pt"))),
  plot_bcd, 
  rel_widths=c(0.7,1),
  labels=c('',''))

plot_abcde <- plot_grid(plot_abcd, 
  cytokine_panel + 
    guides(colour = guide_legend(reverse=T)) +
    theme(plot.margin = unit(c(0.5,1,1,0.8), "cm")),
  ncol=1,
  rel_heights=c(1.1,0.3), labels=c('A','E'))

ggsave(here::here('..','figs','compiled','fig2.pdf'), plot_abcde,
       height=15, width=11)

plot_abcde

## Fig 3 PCA

cast_metrics <- dcast(
  total_combined, CAR.align ~ group + assay + k.type + t.type + comparison, 
  value.var='value.scale')

combined_pca <- prcomp(cast_metrics[,-1])

# calculate pca stats
pca.dt <- data.table(pc = data.table(colnames(combined_pca$rotation))[, 
    PC := as.integer(gsub("[A-Z]", "", V1))][, PC], 
    sd = combined_pca$sdev, 
    var = combined_pca$sdev^2, 
    var.norm = combined_pca$sdev^2/sum(combined_pca$sdev^2), 
    var.acc = cumsum(combined_pca$sdev^2/sum(combined_pca$sdev^2)))

melt.pca.dt <- melt(
  pca.dt, measure.vars = c("var.norm", "var.acc"), variable.name = "metric")

pca.stats.plot <- ggplot(melt.pca.dt) +
  geom_line(aes(x = pc, y = value, color = metric)) +
  geom_point(aes(x = pc, y = value, color = metric)) +
  scale_x_continuous(limits = c(1, NA)) +
  labs(title = "Fraction of Variance Captured by Principal Components, Every Sort Group", 
       x = "Principal Component", y = "Fraction of Variance") +
  theme_bw()

# project data onto principal components
projected.metrics <- scale(cast_metrics[, setdiff(names(cast_metrics), 
  c("CAR.align")), with = FALSE],
  combined_pca$center, combined_pca$scale) %*% combined_pca$rotation

projected.metrics <- cbind.data.frame(CAR.align=cast_metrics[, CAR.align],
  projected.metrics)

projected.metrics <- data.table(merge(projected.metrics, 
  unique(read.counts[, .(CAR.align, len)]), 
  by = "CAR.align"))

projected.metrics[CAR.align == '4-1BB', CAR.align := '41BB']
projected.metrics[, CAR.annot := 'none']
projected.metrics[CAR.align %in% names(car_colors), CAR.annot := CAR.align]
projected.metrics[, CAR.annot := factor(
  CAR.annot, levels=c(names(car_colors)[1:7], 'none'))]

pca.pos.sort.group.plot <- ggplot(projected.metrics,
    aes(x = PC1, y = PC2, label = CAR.align, 
      size=(CAR.align %in% names(car_colors)), 
      color=CAR.annot)) +
  geom_point() +
  geom_text_repel(data=projected.metrics[CAR.annot != 'none'],
    size=4, color='grey20', box.padding = 0.5, segment.alpha=0) +
  scale_size_discrete(guide=F) +
  scale_color_manual('',
    labels=c(names(car_colors)[1:7], ''),
    values=c(car_colors[1:7], 'grey')) +
  labs(title = 'PCA Using CD19+ Sort Group Bins') +
  theme_bw() +
  labs(title='') +
  theme(axis.text = element_blank())
## Warning: Using size for a discrete variable is not advised.
ggsave(here::here('..','figs','new_pca.pdf'), pca.pos.sort.group.plot,
       height=4, width=5)

pca.pos.sort.group.plot

Save workspace for later analyses

data.output.dir <- file.path(here::here(
  '..','..',
  's3-roybal-tcsl',
  'lenti_screen_compiled_data','data'))
save(list=ls(),
  file=file.path(data.output.dir, 'pooled_deseq2_analysis_data.Rdata'))